Load libraries


#### Load packages ####
library(camprotR)
library(ggplot2)
library(tidyverse)
library(MSnbase)
library(biobroom)

Read in the PSM data

psm_res <- readRDS('../results/psm_res.rds')
psm_res %>% names() %>% lapply(function(x){
  psm_res[[x]] %>%
    fData() %>%
    mutate(method=x)
}) %>%
  do.call(what='rbind') %>%
  group_by(method) %>%
  summarise(median_sn=10^median(log10(Average.Reporter.SN), na.rm=TRUE))
`summarise()` ungrouping output (override with `.groups` argument)
129.15/42
[1] 3.075
p <- psm_res %>% names() %>% lapply(function(x){
  psm_res[[x]] %>%
    fData() %>%
    mutate(method=x)
}) %>%
  do.call(what='rbind') %>%
  ggplot(aes(Average.Reporter.SN, colour=method)) +
    geom_density() +
    theme_camprot()

print(p)

print(p + scale_x_log10())


print(p + aes(Isolation.Interference.in.Percent))

Plotting the proportion of missing values


psm_res %>% names() %>% lapply(function(x){
  
  all <- psm_res[[x]]
  hs <- all[fData(all)$species=='H.sapiens']
  sc <- all[fData(all)$species=='S.cerevisiae']
  
  slices <- list('All'=all, 'H.sapiens'=hs, 'S.cerevisiae'=sc)
  for(slice in names(slices)){
    p <- slices[[slice]] %>% plot_missing_SN() +
      ggtitle(sprintf('%s - %s', x, slice))
    print(p)
  
    p <- slices[[slice]] %>% plot_missing_SN_per_sample() +
      ggtitle(sprintf('%s - %s', x, slice))
    print(p)
  }
  return(NULL)
})
Registered S3 methods overwritten by 'htmltools':
  method               from         
  print.html           tools:rstudio
  print.shiny.tag      tools:rstudio
  print.shiny.tag.list tools:rstudio
Registered S3 method overwritten by 'htmlwidgets':
  method           from         
  print.htmlwidget tools:rstudio
Registered S3 method overwritten by 'data.table':
  method           from
  print.data.table     
[[1]]
NULL

[[2]]
NULL

OK, so essentially all the missing values are restricted to PSMs with low (<20) Signal:Noise ratios.


source('./R/get_quant_vs_mean.R')


quant_vs_mean <- psm_res %>% lapply(get_quant_vs_mean) 

quant_vs_mean <- quant_vs_mean %>% lapply(function(x){
  x %>%
    mutate(binned_interference=Hmisc::cut2(
      Isolation.Interference.in.Percent, cuts=c(0,1,5,10,seq(20,100,20))),
      binned_q=Hmisc::cut2(Percolator.q.Value, cuts=c(0, 0.001, 0.002, 0.005, 0.01)),
      binned_average_sn=Hmisc::cut2(Average.Reporter.SN, cuts=c(0,10,20,30,40,60,100)),
      binned_intensity=Hmisc::cut2(intensity, cuts=c(0,10,20,30,40,60,100)))
})
quant_vs_mean %>% lapply(dim)
$`AGC: 2E5`
[1] 1980560      23

$`AGC: 5E4`
[1] 2166658      23
exp_design <- pData(psm_res$`AGC: 2E5`) %>%
  select(condition, S.cerevisiae=yeast, H.sapiens=human) %>%
  unique()
  
sc_spikes <- exp_design$S.cerevisiae
hs_spikes <- exp_design$H.sapiens

get_ground_truth <- function(sc_spikes, hs_spikes, ix_1, ix_2){
  comparison <- sprintf('%s vs %s', sc_spikes[ix_2], sc_spikes[ix_1])
  hs_ground_truth <- hs_spikes[ix_2]/hs_spikes[ix_1]
  sc_ground_truth <- sc_spikes[ix_2]/sc_spikes[ix_1]
  return(c(comparison, hs_ground_truth, sc_ground_truth))
}
library(gtools)

expected <- apply(permutations(n=3,r=2), 1, function(x){
  get_ground_truth(sc_spikes, hs_spikes, x[1], x[2])
}) %>% t() %>% data.frame() %>%
  setNames(c('comparison', 'H.sapiens', 'S.cerevisiae')) %>%
  mutate_at(vars(S.cerevisiae, 
                 H.sapiens), 
            funs(as.numeric)) %>%
  pivot_longer(-comparison, names_to='species', values_to='expected')
`funs()` is deprecated as of dplyr 0.8.0.
Please use a list of either functions or lambdas: 

  # Simple named list: 
  list(mean = mean, median = median)

  # Auto named with `tibble::lst()`: 
  tibble::lst(mean, median)

  # Using lambdas
  list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
This warning is displayed once every 8 hours.
Call `lifecycle::last_warnings()` to see where this warning was generated.
print(expected)

positive_comparisons <- expected %>% filter(species=='S.cerevisiae', expected>1) %>%
  pull(comparison)
quant_vs_mean %>% names() %>% lapply(function(x){
  for(sp in c("S.cerevisiae", "H.sapiens")){
    p <- quant_vs_mean[[x]] %>%
      filter(species==sp, comparison %in% positive_comparisons) %>%
      ggplot(aes(binned_interference, diff, colour=below_notch)) +
      geom_violin() +
      theme_camprot(base_size=10) +
      theme(axis.text.x=element_text(angle=45, vjust=1, hjust=1)) +
      facet_wrap(~comparison, scales='free') +
      geom_hline(aes(yintercept=log2(expected)),
                 data=expected[(expected$species==sp &
                                  expected$comparison %in% positive_comparisons),],
                 colour='grey', linetype=2) +
      xlab('Interference') +
      ylab('Difference in intensity') +
      theme(aspect.ratio=.3) +
      ggtitle(x)
    
    print(p)
  }
  
  p <- quant_vs_mean[[x]] %>%
    filter(species=='S.cerevisiae', Isolation.Interference.in.Percent<=60,
           comparison %in% positive_comparisons) %>%
    ggplot(aes(diff, colour=binned_interference)) +
    geom_density() +
    theme_camprot(base_size=10) +
    theme(axis.text.x=element_text(angle=45, vjust=1, hjust=1)) +
    facet_grid(below_notch~comparison, scales='free') +
    geom_vline(aes(xintercept=log2(expected)),
               data=expected[(expected$species=='S.cerevisiae' &
                                expected$comparison %in% positive_comparisons),],
               colour='grey', linetype=2) +
    xlab('Difference in intensity') +
    ylab('Density') +
    xlim(-6,3) +
    theme(aspect.ratio=2) +
    ggtitle(x)
  
  print(p)
})
[[1]]

[[2]]

quant_vs_mean %>% names() %>% lapply(function(x){
  p <- quant_vs_mean[[x]] %>%
    filter(Isolation.Interference.in.Percent<=50, # no need to consider interference>=50%
           comparison %in% positive_comparisons) %>% 
    filter(species=='S.cerevisiae', !below_notch) %>%
    ggplot(aes(diff, colour=binned_intensity)) +
    geom_density() +
    theme_camprot(base_size=15) +
    theme(axis.text.x=element_text(angle=45, vjust=1, hjust=1)) +
    facet_grid(binned_interference~comparison, scales='free') +
    geom_vline(aes(xintercept=log2(expected)),
               data=expected[(expected$species=='S.cerevisiae' &
                                expected$comparison %in% positive_comparisons),],
               colour=get_cat_palette(1), linetype=2) +
    ylab('Density') +
    xlab('Difference in intensity') +
    xlim(-6,3) +
    ggtitle(x) +
    scale_colour_discrete(name='Intensity')
  
  print(p)
  print(p + aes(colour=binned_interference) + facet_grid(binned_average_sn~comparison) +
    scale_colour_discrete(name='Interference (%)'))
  
  print(p + aes(colour=binned_q) + scale_colour_discrete(name='Percolator Q'))
  print(p + aes(colour=binned_interference) + facet_grid(binned_q~comparison) +
          scale_colour_discrete(name='Interference (%)'))

  
  return(NULL)
})
[[1]]
NULL

[[2]]
NULL

quant_vs_mean %>% names() %>% lapply(function(x){
    p <- quant_vs_mean[[x]] %>%
    filter(Isolation.Interference.in.Percent<=60) %>% # no need to consider interference>=60%
    filter(species=='S.cerevisiae', !below_notch, comparison=='6 vs 1') %>%
    group_by(binned_interference, binned_intensity) %>%
    summarise(median_diff=2^median(diff, na.rm=TRUE), n=length(diff)) %>%
    ggplot(aes(binned_interference, binned_intensity, fill=median_diff)) +
    geom_tile(colour='grey') +
    theme_camprot(base_size=15) +
    scale_fill_gradient(high=get_cat_palette(2)[2],
                        low='white',
                        limits=c(0, 6), name='Observed\nfold change') +
    theme(axis.text.x=element_text(angle=45, vjust=1, hjust=1)) +
    xlab('Binned interference') +
    ylab('Binned intensity') +
    ggtitle(x)
    
    print(p + geom_text(aes(label=round(median_diff, 1)), size=3))
    
    print(p +
            aes(fill=n) +
            scale_fill_gradient(high=get_cat_palette(3)[3],
                                low='white') +
            geom_text(aes(label=n), size=3) )
})
`summarise()` regrouping output by 'binned_interference' (override with `.groups` argument)
[[1]]

[[2]]

quant_vs_mean %>% names() %>% lapply(function(x){
    quant_vs_mean[[x]] %>%
    filter(Isolation.Interference.in.Percent<=60) %>% # no need to consider interference>=60%
    filter(species=='S.cerevisiae', !below_notch, comparison=='6 vs 1') %>%
    group_by(binned_q, binned_interference, binned_intensity) %>%
    summarise(median_diff=2^median(diff, na.rm=TRUE)) %>%
    ggplot(aes(binned_interference, binned_intensity, fill=median_diff)) +
    geom_tile(colour='grey') +
    facet_wrap(~binned_q) +
    theme_camprot(base_size=15) +
    scale_fill_gradient2(high=get_cat_palette(2)[2],
                         low=get_cat_palette(2)[1],
                         mid='white', midpoint=0,
                        limits=c(-1, 6), name='Observed\nfold change') +
    theme(axis.text.x=element_text(angle=45, vjust=1, hjust=1),
          panel.background=element_rect(fill="grey")) +
    xlab('Binned interference') +
    ylab('Binned Intensity') +
    ggtitle(x)
})
`summarise()` regrouping output by 'binned_q', 'binned_interference' (override with `.groups` argument)
`summarise()` regrouping output by 'binned_q', 'binned_interference' (override with `.groups` argument)
[[1]]

[[2]]


quant_vs_mean %>% names() %>% lapply(function(x){
  p <- quant_vs_mean[[x]] %>%
    select(id, species, binned_q, binned_average_sn, binned_interference) %>%
    unique() %>%
    group_by(species, binned_q, binned_average_sn, binned_interference) %>%
    tally() %>%
    ggplot(aes(binned_interference, n)) +
    geom_bar(stat='identity') +
    facet_wrap(~species, scales='free') +
    theme_camprot(base_size=15) +
    theme(axis.text.x=element_text(angle=45, vjust=1, hjust=1)) +
    ggtitle(x)
  
  print(p)
  print(p + aes(binned_q))
  print(p + aes(binned_average_sn) +
          xlab('Signal/Noise'))
  
  return(NULL)
})
[[1]]
NULL

[[2]]
NULL

Based on the above, I’m going to use the following thresholds:

For now, we won’t filer using SN since we want to explore the impact of the notch on fold changes in more detail first

First though, let’s filter by Percolator Q value or Isolation interence alone and check how isolation interference affects PSM-level fold change estimates.

quant_vs_mean %>% names() %>% lapply(function(x){
  
  to_plot_q_flt <- quant_vs_mean[[x]] %>%
    filter(Percolator.q.Value<=0.005) %>%
    filter(species=='S.cerevisiae') %>%
    filter(Isolation.Interference.in.Percent<50) %>%
    arrange(Isolation.Interference.in.Percent)
  
  to_plot_interference_flt <- quant_vs_mean[[x]] %>%
    filter(Isolation.Interference.in.Percent<=10) %>%
    filter(species=='S.cerevisiae') %>%
    arrange(Percolator.q.Value)
  
  p <- to_plot_q_flt %>%
    ggplot(aes(log2(intensity), diff)) +
    geom_point(size=0.1, alpha=0.1, colour='grey80') +
    theme_camprot(base_size=12) +
    facet_wrap(~comparison, scales='free_y') +
    geom_hline(aes(yintercept=log2(expected)),
               data=expected[expected$species=='S.cerevisiae',],
               colour='black', linetype=2) +
    xlab('Tag intensity (log2)') +
    ylab('Difference in intensity (log2)') +
    scale_colour_manual(values=c(get_cat_palette(7)),
                        name='Isolation interference (%)') +
    ggtitle(x)
  
  print(p)
  print(p + geom_smooth(aes(colour=binned_interference), se=FALSE, size=0.5))
  print(p %+% to_plot_interference_flt +
          geom_smooth(aes(colour=binned_q), se=FALSE, size=0.5) +
          scale_colour_manual(values=c(get_cat_palette(7)),
                        name='Percolator Q value'))

  return(NULL)
})
[[1]]
NULL

[[2]]
NULL

Let’s plot the intensity vs difference in intensity for all comparisons and filtering schemes.

quant_vs_mean %>% names() %>% lapply(function(x){
  
    tmp_data <-  quant_vs_mean[[x]] %>%
      filter(species!='mixed', !comparison %in% positive_comparisons)
    
    p <- tmp_data %>%
      ggplot(aes(log2(intensity), diff)) +
      geom_point(size=0.1, alpha=0.05, colour='grey10') +
      geom_density_2d(size=0.25, colour=get_cat_palette(1)) +
      theme_camprot(base_size=12) +
      facet_grid(species~comparison, scales='free_y') +
      geom_hline(aes(yintercept=log2(expected)),
                 data=expected[!expected$comparison %in% positive_comparisons,],
                 colour='black', linetype=2) +
      xlab('Tag intensity (log2)') +
      ylab('Difference in intensity (log2)') +
      ggtitle(x) +
      coord_cartesian(ylim=c(-4,4))
  
    print(p)

    print(p %+% (tmp_data %>%
                   filter(Percolator.q.Value<=0.005)))
    print(p %+% (tmp_data %>%
                   filter(Isolation.Interference.in.Percent<=10)))
    print(p %+% (tmp_data %>%
                   filter(Percolator.q.Value<=0.005, Isolation.Interference.in.Percent<=10)))
    print(p %+% (tmp_data %>%
                   filter(Percolator.q.Value<=0.005, Isolation.Interference.in.Percent<=10, Average.Reporter.SN>10)))
    return(NULL)
})
[[1]]
NULL

[[2]]
NULL

Now, let’s filter the PSMs against these thresholds.

psm_res_flt <- psm_res %>% lapply(function(x){
  out <- filter_TMT_PSMs(x, inter_thresh=20, sn_thresh=0)
  
  out <- out[fData(out)$Percolator.q.Value<=0.001,]
  camprotR:::message_parse(fData(out),
                           'Master.Protein.Accessions',
                           "Percolator Q value filtering")
  out
})
Filtering PSMs...
99650 features found from 10512 master proteins => No quant filtering
72364 features found from 9674 master proteins => Co-isolation filtering
72364 features found from 9674 master proteins => S:N ratio filtering
56814 features found from 8696 master proteins => Percolator Q value filtering
Filtering PSMs...
109197 features found from 10974 master proteins => No quant filtering
77629 features found from 10061 master proteins => Co-isolation filtering
77629 features found from 10061 master proteins => S:N ratio filtering
62610 features found from 9164 master proteins => Percolator Q value filtering
psm_res_flt_sn <- psm_res_flt %>% lapply(function(x){
  out <- filter_TMT_PSMs(x, inter_thresh=20, sn_thresh=10)
  out
})
Filtering PSMs...
56814 features found from 8696 master proteins => No quant filtering
56814 features found from 8696 master proteins => Co-isolation filtering
55292 features found from 8619 master proteins => S:N ratio filtering
Filtering PSMs...
62610 features found from 9164 master proteins => No quant filtering
62610 features found from 9164 master proteins => Co-isolation filtering
57787 features found from 8965 master proteins => S:N ratio filtering

psm_res %>% names() %>% lapply(function(x){
  
  datasets <- list('unfiltered'=psm_res, 'filtered'=psm_res_flt, '\nfiltered, inc S/N'=psm_res_flt_sn)
  
  for(dataset in names(datasets)){
    
    all <- datasets[[dataset]][[x]]
    hs <- all[fData(all)$species=='H.sapiens']
    sc <- all[fData(all)$species=='S.cerevisiae']
    
    slices <- list('All'=all, 'H.sapiens'=hs, 'S.cerevisiae'=sc)
    for(slice in names(slices)){
      p <- slices[[slice]] %>% plot_TMT_notch() +
        ggtitle(sprintf('%s - %s - %s', x, slice, dataset))
      print(p)
    }
  }
  
  return(NULL)
  
})
[[1]]
NULL

[[2]]
NULL

Per-tag notch plots


psm_res %>% names() %>% lapply(function(x){
  
  datasets <- list('unfiltered'=psm_res, 'filtered'=psm_res_flt, '\nfiltered, inc S/N'=psm_res_flt_sn)
  
  for(dataset in names(datasets)){
    
    all <- datasets[[dataset]][[x]]
    hs <- all[fData(all)$species=='H.sapiens']
    sc <- all[fData(all)$species=='S.cerevisiae']
    
    slices <- list('All'=all, 'H.sapiens'=hs, 'S.cerevisiae'=sc)
    for(slice in names(slices)){
      p <- slices[[slice]] %>% plot_TMT_notch(facet_by_sample=TRUE) +
        ggtitle(sprintf('%s - %s - %s', x, slice, dataset))
      print(p)
    }
  }
  
  return(NULL)
  
})
[[1]]
NULL

[[2]]
NULL

Tallies for fraction sub-notch PSMs per protein


psm_res %>% names() %>% lapply(function(x){
  
  datasets <- list('unfiltered'=psm_res, 'filtered'=psm_res_flt, '\nfiltered, inc S/N'=psm_res_flt_sn)
  
  for(dataset in names(datasets)){
    
    all <- datasets[[dataset]][[x]]
    hs <- all[fData(all)$species=='H.sapiens']
    sc <- all[fData(all)$species=='S.cerevisiae']
    
    slices <- list('All'=all, 'H.sapiens'=hs, 'S.cerevisiae'=sc)
    for(slice in names(slices)){
      
      notch_per_protein <- get_notch_per_protein(slices[[slice]]) %>%
        filter(fraction_below>0)
      
      p <- plot_fraction_below_notch_per_prot(notch_per_protein) +
        ggtitle(sprintf('%s - %s - %s', x, slice, dataset))
      
      print(p)
      }
  }
  
  return(NULL)
  
})
[[1]]
NULL

[[2]]
NULL

Missing values frequencies.

psm_res %>% names() %>% lapply(function(x){
  
  datasets <- list('unfiltered'=psm_res, 'filtered'=psm_res_flt, '\nfiltered, inc S/N'=psm_res_flt_sn)
  
  for(dataset in names(datasets)){
    
    all <- datasets[[dataset]][[x]]
    hs <- all[fData(all)$species=='H.sapiens']
    sc <- all[fData(all)$species=='S.cerevisiae']
    
    slices <- list('All'=all, 'H.sapiens'=hs, 'S.cerevisiae'=sc)
    for(slice in names(slices)){
      plotNA(slices[[slice]], pNA = 0)
    }
  }
  
  return(NULL)
  
})
[[1]]
NULL

[[2]]
NULL

Save out objects for downstream notebooks

saveRDS(quant_vs_mean, '../results/quant_vs_mean.rds')
saveRDS(psm_res_flt, '../results/psm_res_flt.rds')
saveRDS(psm_res_flt_sn, '../results/psm_res_flt_sn.rds')
saveRDS(expected, '../results/expected.rds')
---
title: 'Filter PSMs'
author:
  - name: "Tom Smith"
    affiliation: "Cambridge Centre for Proteomics"
date: "`r format(Sys.time(), '%d %B, %Y')`"
abstract: | 
  Here, we filter the PSM-level PD output, with thresholds informed by missing values,
  notch prominence and observed fold changes vs ground truths.
output:
  pdf_document:
  html_notebook: default
geometry: margin=1in
fontfamily: mathpazo
fontsize: 11pt
---

Load libraries
```{r setup, message=FALSE}

#### Load packages ####
library(camprotR)
library(ggplot2)
library(tidyverse)
library(MSnbase)
library(biobroom)
```

Read in the PSM data
```{r}
psm_res <- readRDS('../results/psm_res.rds')
```

```{r}
psm_res %>% names() %>% lapply(function(x){
  psm_res[[x]] %>%
    fData() %>%
    mutate(method=x)
}) %>%
  do.call(what='rbind') %>%
  group_by(method) %>%
  summarise(median_sn=10^median(log10(Average.Reporter.SN), na.rm=TRUE))
129.15/42
```

```{r}
p <- psm_res %>% names() %>% lapply(function(x){
  psm_res[[x]] %>%
    fData() %>%
    mutate(method=x)
}) %>%
  do.call(what='rbind') %>%
  ggplot(aes(Average.Reporter.SN, colour=method)) +
    geom_density() +
    theme_camprot()

print(p)
print(p + scale_x_log10())

print(p + aes(Isolation.Interference.in.Percent))
```

Plotting the proportion of missing values 
```{r}

psm_res %>% names() %>% lapply(function(x){
  
  all <- psm_res[[x]]
  hs <- all[fData(all)$species=='H.sapiens']
  sc <- all[fData(all)$species=='S.cerevisiae']
  
  slices <- list('All'=all, 'H.sapiens'=hs, 'S.cerevisiae'=sc)
  for(slice in names(slices)){
    p <- slices[[slice]] %>% plot_missing_SN() +
      ggtitle(sprintf('%s - %s', x, slice))
    print(p)
  
    p <- slices[[slice]] %>% plot_missing_SN_per_sample() +
      ggtitle(sprintf('%s - %s', x, slice))
    print(p)
  }
  return(NULL)
})
```
OK, so essentially all the missing values are restricted to PSMs with low (<20) Signal:Noise ratios.


```{r}

source('./R/get_quant_vs_mean.R')


quant_vs_mean <- psm_res %>% lapply(get_quant_vs_mean) 

quant_vs_mean <- quant_vs_mean %>% lapply(function(x){
  x %>%
    mutate(binned_interference=Hmisc::cut2(
      Isolation.Interference.in.Percent, cuts=c(0,1,5,10,seq(20,100,20))),
      binned_q=Hmisc::cut2(Percolator.q.Value, cuts=c(0, 0.001, 0.002, 0.005, 0.01)),
      binned_average_sn=Hmisc::cut2(Average.Reporter.SN, cuts=c(0,10,20,30,40,60,100)),
      binned_intensity=Hmisc::cut2(intensity, cuts=c(0,10,20,30,40,60,100)))
})
```


```{r}
quant_vs_mean %>% lapply(dim)
```



```{r}
exp_design <- pData(psm_res$`AGC: 2E5`) %>%
  select(condition, S.cerevisiae=yeast, H.sapiens=human) %>%
  unique()
  
sc_spikes <- exp_design$S.cerevisiae
hs_spikes <- exp_design$H.sapiens

get_ground_truth <- function(sc_spikes, hs_spikes, ix_1, ix_2){
  comparison <- sprintf('%s vs %s', sc_spikes[ix_2], sc_spikes[ix_1])
  hs_ground_truth <- hs_spikes[ix_2]/hs_spikes[ix_1]
  sc_ground_truth <- sc_spikes[ix_2]/sc_spikes[ix_1]
  return(c(comparison, hs_ground_truth, sc_ground_truth))
}
library(gtools)

expected <- apply(permutations(n=3,r=2), 1, function(x){
  get_ground_truth(sc_spikes, hs_spikes, x[1], x[2])
}) %>% t() %>% data.frame() %>%
  setNames(c('comparison', 'H.sapiens', 'S.cerevisiae')) %>%
  mutate_at(vars(S.cerevisiae, 
                 H.sapiens), 
            funs(as.numeric)) %>%
  pivot_longer(-comparison, names_to='species', values_to='expected')

print(expected)

positive_comparisons <- expected %>% filter(species=='S.cerevisiae', expected>1) %>%
  pull(comparison)
```
```{r}
quant_vs_mean %>% names() %>% lapply(function(x){
  for(sp in c("S.cerevisiae", "H.sapiens")){
    p <- quant_vs_mean[[x]] %>%
      filter(species==sp, comparison %in% positive_comparisons) %>%
      ggplot(aes(binned_interference, diff, colour=below_notch)) +
      geom_violin() +
      theme_camprot(base_size=10) +
      theme(axis.text.x=element_text(angle=45, vjust=1, hjust=1)) +
      facet_wrap(~comparison, scales='free') +
      geom_hline(aes(yintercept=log2(expected)),
                 data=expected[(expected$species==sp &
                                  expected$comparison %in% positive_comparisons),],
                 colour='grey', linetype=2) +
      xlab('Interference') +
      ylab('Difference in intensity') +
      theme(aspect.ratio=.3) +
      ggtitle(x)
    
    print(p)
  }
  
  p <- quant_vs_mean[[x]] %>%
    filter(species=='S.cerevisiae', Isolation.Interference.in.Percent<=60,
           comparison %in% positive_comparisons) %>%
    ggplot(aes(diff, colour=binned_interference)) +
    geom_density() +
    theme_camprot(base_size=10) +
    theme(axis.text.x=element_text(angle=45, vjust=1, hjust=1)) +
    facet_grid(below_notch~comparison, scales='free') +
    geom_vline(aes(xintercept=log2(expected)),
               data=expected[(expected$species=='S.cerevisiae' &
                                expected$comparison %in% positive_comparisons),],
               colour='grey', linetype=2) +
    xlab('Difference in intensity') +
    ylab('Density') +
    xlim(-6,3) +
    theme(aspect.ratio=2) +
    ggtitle(x)
  
  print(p)
})
```
```{r, fig.height=8, fig.width=8, warning=FALSE}
quant_vs_mean %>% names() %>% lapply(function(x){
  p <- quant_vs_mean[[x]] %>%
    filter(Isolation.Interference.in.Percent<=50, # no need to consider interference>=50%
           comparison %in% positive_comparisons) %>% 
    filter(species=='S.cerevisiae', !below_notch) %>%
    ggplot(aes(diff, colour=binned_intensity)) +
    geom_density() +
    theme_camprot(base_size=15) +
    theme(axis.text.x=element_text(angle=45, vjust=1, hjust=1)) +
    facet_grid(binned_interference~comparison, scales='free') +
    geom_vline(aes(xintercept=log2(expected)),
               data=expected[(expected$species=='S.cerevisiae' &
                                expected$comparison %in% positive_comparisons),],
               colour=get_cat_palette(1), linetype=2) +
    ylab('Density') +
    xlab('Difference in intensity') +
    xlim(-6,3) +
    ggtitle(x) +
    scale_colour_discrete(name='Intensity')
  
  print(p)
  print(p + aes(colour=binned_interference) + facet_grid(binned_average_sn~comparison) +
    scale_colour_discrete(name='Interference (%)'))
  
  print(p + aes(colour=binned_q) + scale_colour_discrete(name='Percolator Q'))
  print(p + aes(colour=binned_interference) + facet_grid(binned_q~comparison) +
          scale_colour_discrete(name='Interference (%)'))

  
  return(NULL)
})
```

```{r}
quant_vs_mean %>% names() %>% lapply(function(x){
    p <- quant_vs_mean[[x]] %>%
    filter(Isolation.Interference.in.Percent<=60) %>% # no need to consider interference>=60%
    filter(species=='S.cerevisiae', !below_notch, comparison=='6 vs 1') %>%
    group_by(binned_interference, binned_intensity) %>%
    summarise(median_diff=2^median(diff, na.rm=TRUE), n=length(diff)) %>%
    ggplot(aes(binned_interference, binned_intensity, fill=median_diff)) +
    geom_tile(colour='grey') +
    theme_camprot(base_size=15) +
    scale_fill_gradient(high=get_cat_palette(2)[2],
                        low='white',
                        limits=c(0, 6), name='Observed\nfold change') +
    theme(axis.text.x=element_text(angle=45, vjust=1, hjust=1)) +
    xlab('Binned interference') +
    ylab('Binned intensity') +
    ggtitle(x)
    
    print(p + geom_text(aes(label=round(median_diff, 1)), size=3))
    
    print(p +
            aes(fill=n) +
            scale_fill_gradient(high=get_cat_palette(3)[3],
                                low='white') +
            geom_text(aes(label=n), size=3) )
})
```

```{r, fig.height=6, fig.width=6}
quant_vs_mean %>% names() %>% lapply(function(x){
    quant_vs_mean[[x]] %>%
    filter(Isolation.Interference.in.Percent<=60) %>% # no need to consider interference>=60%
    filter(species=='S.cerevisiae', !below_notch, comparison=='6 vs 1') %>%
    group_by(binned_q, binned_interference, binned_intensity) %>%
    summarise(median_diff=2^median(diff, na.rm=TRUE)) %>%
    ggplot(aes(binned_interference, binned_intensity, fill=median_diff)) +
    geom_tile(colour='grey') +
    facet_wrap(~binned_q) +
    theme_camprot(base_size=15) +
    scale_fill_gradient2(high=get_cat_palette(2)[2],
                         low=get_cat_palette(2)[1],
                         mid='white', midpoint=0,
                        limits=c(-1, 6), name='Observed\nfold change') +
    theme(axis.text.x=element_text(angle=45, vjust=1, hjust=1),
          panel.background=element_rect(fill="grey")) +
    xlab('Binned interference') +
    ylab('Binned Intensity') +
    ggtitle(x)
})
```

```{r}

quant_vs_mean %>% names() %>% lapply(function(x){
  p <- quant_vs_mean[[x]] %>%
    select(id, species, binned_q, binned_average_sn, binned_interference) %>%
    unique() %>%
    group_by(species, binned_q, binned_average_sn, binned_interference) %>%
    tally() %>%
    ggplot(aes(binned_interference, n)) +
    geom_bar(stat='identity') +
    facet_wrap(~species, scales='free') +
    theme_camprot(base_size=15) +
    theme(axis.text.x=element_text(angle=45, vjust=1, hjust=1)) +
    ggtitle(x)
  
  print(p)
  print(p + aes(binned_q))
  print(p + aes(binned_average_sn) +
          xlab('Signal/Noise'))
  
  return(NULL)
})
```


Based on the above, I'm going to use the following thresholds:

- Isolation interference <= 10%
- Percolator Q value <= 0.001

For now, we won't filer using SN since we want to explore the impact of the notch on fold changes in more detail first

First though, let's filter by Percolator Q value or Isolation interence alone and check how isolation interference affects PSM-level fold change estimates.
```{r, fig.height=9, fig.width=9}
quant_vs_mean %>% names() %>% lapply(function(x){
  
  to_plot_q_flt <- quant_vs_mean[[x]] %>%
    filter(Percolator.q.Value<=0.005) %>%
    filter(species=='S.cerevisiae') %>%
    filter(Isolation.Interference.in.Percent<50) %>%
    arrange(Isolation.Interference.in.Percent)
  
  to_plot_interference_flt <- quant_vs_mean[[x]] %>%
    filter(Isolation.Interference.in.Percent<=10) %>%
    filter(species=='S.cerevisiae') %>%
    arrange(Percolator.q.Value)
  
  p <- to_plot_q_flt %>%
    ggplot(aes(log2(intensity), diff)) +
    geom_point(size=0.1, alpha=0.1, colour='grey80') +
    theme_camprot(base_size=12) +
    facet_wrap(~comparison, scales='free_y') +
    geom_hline(aes(yintercept=log2(expected)),
               data=expected[expected$species=='S.cerevisiae',],
               colour='black', linetype=2) +
    xlab('Tag intensity (log2)') +
    ylab('Difference in intensity (log2)') +
    scale_colour_manual(values=c(get_cat_palette(7)),
                        name='Isolation interference (%)') +
    ggtitle(x)
  
  print(p)
  print(p + geom_smooth(aes(colour=binned_interference), se=FALSE, size=0.5))
  print(p %+% to_plot_interference_flt +
          geom_smooth(aes(colour=binned_q), se=FALSE, size=0.5) +
          scale_colour_manual(values=c(get_cat_palette(7)),
                        name='Percolator Q value'))

  return(NULL)
})
```
Let's plot the intensity vs difference in intensity for all comparisons and filtering schemes.
```{r}
quant_vs_mean %>% names() %>% lapply(function(x){
  
    tmp_data <-  quant_vs_mean[[x]] %>%
      filter(species!='mixed', !comparison %in% positive_comparisons)
    
    p <- tmp_data %>%
      ggplot(aes(log2(intensity), diff)) +
      geom_point(size=0.1, alpha=0.05, colour='grey10') +
      geom_density_2d(size=0.25, colour=get_cat_palette(1)) +
      theme_camprot(base_size=12) +
      facet_grid(species~comparison, scales='free_y') +
      geom_hline(aes(yintercept=log2(expected)),
                 data=expected[!expected$comparison %in% positive_comparisons,],
                 colour='black', linetype=2) +
      xlab('Tag intensity (log2)') +
      ylab('Difference in intensity (log2)') +
      ggtitle(x) +
      coord_cartesian(ylim=c(-4,4))
  
    print(p)

    print(p %+% (tmp_data %>%
                   filter(Percolator.q.Value<=0.005)))
    print(p %+% (tmp_data %>%
                   filter(Isolation.Interference.in.Percent<=10)))
    print(p %+% (tmp_data %>%
                   filter(Percolator.q.Value<=0.005, Isolation.Interference.in.Percent<=10)))
    print(p %+% (tmp_data %>%
                   filter(Percolator.q.Value<=0.005, Isolation.Interference.in.Percent<=10, Average.Reporter.SN>10)))
    return(NULL)
})
```


Now, let's filter the PSMs against these thresholds.
```{r}
psm_res_flt <- psm_res %>% lapply(function(x){
  out <- filter_TMT_PSMs(x, inter_thresh=20, sn_thresh=0)
  
  out <- out[fData(out)$Percolator.q.Value<=0.001,]
  camprotR:::message_parse(fData(out),
                           'Master.Protein.Accessions',
                           "Percolator Q value filtering")
  out
})

psm_res_flt_sn <- psm_res_flt %>% lapply(function(x){
  out <- filter_TMT_PSMs(x, inter_thresh=20, sn_thresh=10)
  out
})
```

```{r}

psm_res %>% names() %>% lapply(function(x){
  
  datasets <- list('unfiltered'=psm_res, 'filtered'=psm_res_flt, '\nfiltered, inc S/N'=psm_res_flt_sn)
  
  for(dataset in names(datasets)){
    
    all <- datasets[[dataset]][[x]]
    hs <- all[fData(all)$species=='H.sapiens']
    sc <- all[fData(all)$species=='S.cerevisiae']
    
    slices <- list('All'=all, 'H.sapiens'=hs, 'S.cerevisiae'=sc)
    for(slice in names(slices)){
      p <- slices[[slice]] %>% plot_TMT_notch() +
        ggtitle(sprintf('%s - %s - %s', x, slice, dataset))
      print(p)
    }
  }
  
  return(NULL)
  
})

```

Per-tag notch plots
```{r, fig.height=8, fig.width=8}

psm_res %>% names() %>% lapply(function(x){
  
  datasets <- list('unfiltered'=psm_res, 'filtered'=psm_res_flt, '\nfiltered, inc S/N'=psm_res_flt_sn)
  
  for(dataset in names(datasets)){
    
    all <- datasets[[dataset]][[x]]
    hs <- all[fData(all)$species=='H.sapiens']
    sc <- all[fData(all)$species=='S.cerevisiae']
    
    slices <- list('All'=all, 'H.sapiens'=hs, 'S.cerevisiae'=sc)
    for(slice in names(slices)){
      p <- slices[[slice]] %>% plot_TMT_notch(facet_by_sample=TRUE) +
        ggtitle(sprintf('%s - %s - %s', x, slice, dataset))
      print(p)
    }
  }
  
  return(NULL)
  
})
```

Tallies for fraction sub-notch PSMs per protein
```{r}

psm_res %>% names() %>% lapply(function(x){
  
  datasets <- list('unfiltered'=psm_res, 'filtered'=psm_res_flt, '\nfiltered, inc S/N'=psm_res_flt_sn)
  
  for(dataset in names(datasets)){
    
    all <- datasets[[dataset]][[x]]
    hs <- all[fData(all)$species=='H.sapiens']
    sc <- all[fData(all)$species=='S.cerevisiae']
    
    slices <- list('All'=all, 'H.sapiens'=hs, 'S.cerevisiae'=sc)
    for(slice in names(slices)){
      
      notch_per_protein <- get_notch_per_protein(slices[[slice]]) %>%
        filter(fraction_below>0)
      
      p <- plot_fraction_below_notch_per_prot(notch_per_protein) +
        ggtitle(sprintf('%s - %s - %s', x, slice, dataset))
      
      print(p)
      }
  }
  
  return(NULL)
  
})


```


Missing values frequencies.
```{r}
psm_res %>% names() %>% lapply(function(x){
  
  datasets <- list('unfiltered'=psm_res, 'filtered'=psm_res_flt, '\nfiltered, inc S/N'=psm_res_flt_sn)
  
  for(dataset in names(datasets)){
    
    all <- datasets[[dataset]][[x]]
    hs <- all[fData(all)$species=='H.sapiens']
    sc <- all[fData(all)$species=='S.cerevisiae']
    
    slices <- list('All'=all, 'H.sapiens'=hs, 'S.cerevisiae'=sc)
    for(slice in names(slices)){
      plotNA(slices[[slice]], pNA = 0)
    }
  }
  
  return(NULL)
  
})

```

Save out objects for downstream notebooks
```{r}
saveRDS(quant_vs_mean, '../results/quant_vs_mean.rds')
saveRDS(psm_res_flt, '../results/psm_res_flt.rds')
saveRDS(psm_res_flt_sn, '../results/psm_res_flt_sn.rds')
saveRDS(expected, '../results/expected.rds')
```


